#### Visual Analytics Coursework ####
# Import the Required Modules
import time
start = time.time()  # wall-clock start, used nowhere below in this chunk (timing reference)
# Set matplotlib to plot in the notebook
# NOTE: %pylab is IPython magic - it star-imports numpy and matplotlib.pyplot
# into the namespace (this is where the bare `plot`/`plt` names come from).
%pylab inline
import pandas as pd
import numpy as np
# NOTE(review): __future__ imports must be the first statement of a module;
# this only works because each notebook cell is compiled separately - confirm
# if this file is ever run as a plain script.
from __future__ import division
import utils
import seaborn
from six.moves import zip
from sklearn import preprocessing
from rpy2.robjects.packages import importr
# Set the Default Seaborn Colours
seaborn.set()
colors = seaborn.color_palette()
seaborn.set_context(rc={"figure.figsize": (12, 12)})
# Turn off Pandas Future Warnings
pd.set_option('chained_assignment',None)
from collections import OrderedDict
import numpy as np  # duplicate of the earlier numpy import (harmless)
from bokeh.charts import Histogram
from bokeh.plotting import *
output_notebook()  # route bokeh output into the notebook
# Import the Data - Using Pandas
Data_CF = pd.read_csv('Crossfit_Open_2011_Dataset.csv',
                      sep= ',')
# Check the Import for Errors
print(Data_CF.head(5))
# Get a List of Column Headers for Reference
Column_Names = Data_CF.columns
# Print Column names into a List
[x for x in Column_Names]
# Get a subset of the Data Set of Features for Analysis
# NOTE: several source columns carry a leading space in their name
# (' overall-points', ' score1', ...) - the labels below must match exactly.
Data_CF_VA = Data_CF[['athlete_ID',
                      'First_Name',
                      'Last_Name',
                      'Region',
                      'age',
                      'Gender',
                      'Height_cm',
                      'Weight_kg',
                      ' overall-points',
                      'overall-rank',
                      ' score1',
                      'rank1',
                      ' score2',
                      ' rank2',
                      ' score3',
                      'rank3',
                      ' score4',
                      'rank4',
                      ' score5',
                      'rank5',
                      ' score6',
                      'rank6']]
# Get some Summary Statistics of the Data
Data_CF_VA.describe().T
# Plot each feature on a history gram - Phase 1 of Methodology
Data_CF_VA_Columns = Data_CF_VA.columns
# Pandas Histogram Plots - Height (Transformed Variable)
Data_CF_VA['Height_cm'].hist(bins=100);
plt.title('Height Distribution');
plt.ylabel('Frequency');
plt.xlabel('Height (cm)');
plt.show()
# Clearly Outliers Exists - Accomadation for this will need to be applied
# Pandas Histogram Plots - Age
Data_CF_VA['age'].hist(bins=24)
plt.title('Age Distribution'); plt.ylabel('# Athletes'); plt.xlabel('Age'); plt.show()
# Pandas Histogram Plots - Weight_kg
Data_CF_VA['Weight_kg'].hist(bins=100);
plt.title('Weight (kg) Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Weight (kilograms)');
plt.show()
# Pandas Histogram Plots - Overall Points
# Column index 8 of the subset is ' overall-points' (see selection above).
Data_CF_VA[Data_CF_VA_Columns[8]].hist(bins=100);
plt.title('Final Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 100 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 1
Data_CF_VA[' score1'].hist(bins=50); plt.title('Week 1 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 2
Data_CF_VA[' score2'].hist(bins=50);
plt.title('Week 2 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 3
Data_CF_VA[' score3'].hist(bins=50);
plt.title('Week 3 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 4
Data_CF_VA[' score4'].hist(bins=50);
plt.title('Week 4 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 5
Data_CF_VA[' score5'].hist(bins=50);
plt.title('Week 5 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Pandas Histogram Plots - Score After Week 5.1 (score6 = the 5.1 redo event)
Data_CF_VA[' score6'].hist(bins=50);
plt.title('Week 5.1 Points Distribution');
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.show()
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Phase 1 Conclusions
#
# Age - The histogram plot conveys a distribution that I would expect, whereby its heavily dominated by the number of
# athletes in the age of 20-40.
# No requirement to normalise the data yet or outlier removal since the data is distributed as expected.
#
# Height - The histogram plot clearly indentifies a number of athletes that have erroneous/incorrect values for their height.
# Outliers will need to be removed and then the plot will need to be reassesed
# Transform in the form of normalisation or standardisation may be required.
#
# Weight - The plot appears to be well distributed amoungst the bins, when changeing the number of bins it can be noted that
# there are some outliers - removal required since they appear to be dubious. <50kg and >160kg
#
# Overall Points - There appears to be some clear distictions in the distbution
# 1 - A high peak and clear group can be seen where the number of points <500
# 2 - Another group from about 500-3000 points
# 3 - The final set where the number of points >3000
#
# Week 1-5 Scores - The score plots consider all possible values.
# When considering and comparing groups consideration of the missing values, where the score is 0, need to be
# accounted for.
# The number of individuals entering appears to be decreasing from Week 1 to Week 5.
# Further analysis will be need to understand if this really is the case
# Phase 1.1 - Adjustments to Features based on the Conclusions mentioned above
# Dealing with Height
# Height - The tallest man alive is 251cm - Sultan Kösen (Turkey, b.10 December 1982)
# http://www.guinnessworldrecords.com/world-records/tallest-man-living
# Boolean mask of plausible heights (assigned here but only used via the
# derived column below; it is recomputed later when the rule is applied).
Height_Rule = Data_CF_VA.Height_cm < 251
Data_CF_VA['Height_Gr_251'] = Data_CF_VA.Height_cm < 251
# Frequency TABLE
pd.Series(Data_CF_VA['Height_Gr_251']).value_counts()
# The output below indicate the number of usable instances where the weight can be included as part of the analysis
# NOTE(review): the sentence above says "weight" but the rule is on height.
# Define a function to create the central tendency about a CI
def confid_int_plot(point, ci, y, color, label):
    """Draw a central-tendency marker and its interval at height *y*.

    Parameters
    ----------
    point : float
        Central value (e.g. the mean), drawn as a dot.
    ci : sequence of two floats
        Lower and upper bounds of the interval, drawn as a thick line.
    y : float
        Vertical position (data coordinates) at which to draw both artists.
    color : matplotlib colour spec
        Colour shared by the line and the dot.
    label : str
        Legend label attached to the interval line.
    """
    # Use the explicit `plt` namespace instead of the bare `plot` injected by
    # %pylab - identical behaviour, and consistent with the rest of the file.
    plt.plot(ci, [y, y], "-", color=color, linewidth=4, label=label)
    plt.plot(point, y, "o", color=color, markersize=10)
# Position on the y-axis where the INterval will be plotted
int_y = 500
# Calculate some statistics on heights below the 251 cm plausibility cap
d = Data_CF_VA[Data_CF_VA.Height_cm < 251].Height_cm
m = d.mean()
s = d.std()
# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d.values, 50)
# Add the Confidence Interval (mean +/- 4 standard deviations)
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Height Distribution')
plt.ylabel('# Athletes')
plt.xlabel('Height (cm)')
plt.legend(loc="best")
plt.show()
# Adjustment to the Weight Feature to Remove the Erroneous Values
# Calculate some statistics
d = Data_CF_VA.Weight_kg
m = d.mean()
s = d.std()
# Histogram over the non-null weights only (plt.hist cannot handle NaN)
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes')
plt.xlabel('Weight (kg)')
plt.legend(loc="best")
plt.show()
# Get the scores into a seperate Dataframe
Data_CF_VA_Scores = Data_CF_VA[[' score1',
                                ' score2',
                                ' score3',
                                ' score4',
                                ' score5',
                                ' score6']]
# Convert all the -1 to NAN values (-1 is the "no score submitted" sentinel)
Data_CF_VA_Scores[Data_CF_VA_Scores == -1] = np.nan
# Show Participation for Each Score Submitted
Number_Null = []; Number_Non_Null = []; DF_Length = Data_CF_VA_Scores.shape[0]
# Loop through each column to count values (Series.count ignores NaN)
for col in Data_CF_VA_Scores:
    Values = Data_CF_VA_Scores[col].count()
    Number_Non_Null.append(Values)
    Number_Null.append(DF_Length - Values)
# Convert List to Tuple for Plotting
def totuple(a):
    """Recursively convert nested iterables into nested tuples.

    Non-iterable leaves (ints, floats, ...) are returned unchanged.

    Fix: strings and bytes are now treated as leaves.  Iterating a
    one-character string yields that same string again, so the original
    recursed without terminating (RecursionError) on any string input.
    """
    if isinstance(a, (str, bytes)):
        return a
    try:
        return tuple(totuple(i) for i in a)
    except TypeError:
        # `a` is not iterable - it is a leaf value.
        return a
# Convert Ranges to Tuples
Number_Non_Null = totuple(Number_Non_Null)
Number_Null = totuple(Number_Null)
ind = np.arange(len(Number_Non_Null))  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence
# Plot Each Type - stacked bars: submitted (green) below missing (yellow)
p1 = plt.bar(ind, Number_Non_Null, width, color='g')
p2 = plt.bar(ind, Number_Null, width, color='y',bottom=Number_Non_Null)
# Annotate the Chart
plt.ylabel('Number of Athletes Participating')
plt.title('Participation per Week')
plt.xticks(ind+width/2., ('Week1', 'Week2', 'Week3', 'Week4', 'Week5', 'Week6') )
plt.legend( (p1[0], p2[0]), ('Score Submitted', 'No Score Submitted'), loc="best")
plt.show()
# There appears to be a consistent drop off in the number of scores submitted - investigate this in terms of
# Percentage change
# Calculate Percentage Changes per Week
def percent_change(old, new):
    """Return the relative change from *old* to *new*, in percent.

    Positive when *new* exceeds *old*, negative for a decrease.
    """
    return ((new - old) / float(old)) * 100
# Percentage change in participation between consecutive weeks
Number_Non_Null_Change = []
for i in ind:
    # Stop before indexing one past the final week
    if i+1 > ind.max():
        break
    else:
        Number_Non_Null_Change.append(percent_change(Number_Non_Null[i], Number_Non_Null[i+1]))
Number_Non_Null_Change = totuple(Number_Non_Null_Change)
# Plot Each Type
p1 = plt.bar(np.arange(len(Number_Non_Null_Change)), Number_Non_Null_Change, width, color='r')
# Annotate the Chart
plt.ylabel('% Change of the Number of Athletes Participating')
plt.ylim( (-20, 20) )
plt.title('Participation per Week')
plt.xticks(ind+width/2., ('Week1 - Week2', 'Week2 - Week3', 'Week3 - Week4', 'Week4 - Week5', 'Week5 - Week6'), rotation=45)
# NOTE(review): matplotlib treats a bare string passed as `labels` as a
# sequence of characters - confirm the legend renders as intended here.
plt.legend(labels = 'Week on Week Change')
plt.show()
# The biggest drops between weeks occurred between 2-3, 4-5 and 5-6.
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# NOTE(review): the plt.title calls in this section say 'Weight Distribution'
# but the data plotted is the weekly score - likely copy-paste titles.
# Calculate some statistics
d = Data_CF_VA_Scores[' score1']
m = d.mean()
s = d.std()
# Histogram of the non-null score1 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# No adjustment since the distribution of the data in the majority is within the CI
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# From the Previous visulisation it can be seen that there exist some outliers - the Green bar indicates a Confidence
# of 4 Standard deviations from the mean
# Calculate some statistics
d = Data_CF_VA_Scores[' score2']
m = d.mean()
s = d.std()
# Histogram of the non-null score2 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# I will remove those outside this range and re-plot
Score_2_Rule = (d <= m + 4*s) & (d >= m - 4*s)
# Subset using the rule above
d = d[Score_2_Rule]
# Calculate some statistics (recomputed on the trimmed data)
m = d.mean()
s = d.std()
# Re-plot score2 with the +/-4 sigma outliers removed
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# The resulting Plots now corrects alot of the observed problems of the dataset
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# NOTE(review): as in the previous section, the 'Weight Distribution' titles
# below appear to be copy-paste leftovers - the data is the weekly score.
# Calculate some statistics
d = Data_CF_VA_Scores[' score3']
m = d.mean()
s = d.std()
# Histogram of the non-null score3 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# No adjustment since the distribution of the data in the majority is within the CI
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Calculate some statistics
d = Data_CF_VA_Scores[' score4']
m = d.mean()
s = d.std()
# Histogram of the non-null score4 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# No adjustment since the distribution of the data in the majority is within the CI
# Where the data is not - it will be excused as excluding them it would mean that the "Top" Scores would be removed
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Calculate some statistics
d = Data_CF_VA_Scores[' score5']
m = d.mean()
s = d.std()
# Histogram of the non-null score5 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# No adjustment since the distribution of the data in the majority is within the CI
# Revisulise the with the Null Values Excluded
# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
# Calculate some statistics
d = Data_CF_VA_Scores[' score6']
m = d.mean()
s = d.std()
# Histogram of the non-null score6 values
plt.hist(d[pd.notnull(d)].values, 50)
# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")
# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution')
plt.ylabel('# Athletes');
plt.xlabel('Points (binned 50 into groups)');
plt.legend(loc="best")
plt.show()
# No adjustment since the distribution of the data in the majority is within the CI
# Apply All of the Changes to the Dataset and obtain Summary Statistics
# An Overview of Changes to Apply to the Main Dataset - Data_CF_VA
# 1 - Change -1 values to NAN
# 2 - Apply the Score2 Rule to exclude the outliers
# 3 - Remove the Erroneous Height Values
# 4 - Remove NAN Overall Points
# 1 - Change -1 values to NAN
for name in Data_CF_VA_Columns:
    # Replace only those columns that contain score in the label
    if name.startswith(' score'):
        # Fix: the "no score" sentinel is numeric -1 (compare the earlier
        # `Data_CF_VA_Scores == -1` check), but this line replaced only the
        # *string* '-1', which left the sentinel values untouched.  Both
        # forms are replaced here to be safe about mixed-dtype columns.
        Data_CF_VA[name].replace([-1, '-1'], value = np.nan, inplace = True)
# 2 - Apply the Score2 Rule to exclude the outliers
# Calculate some statistics
d = Data_CF_VA[' score2']
m = d.mean()
s = d.std()
# Rule to Remove +- 4 Standard Deviations from the Dataset
Score_2_Rule = (d <= m + 4*s) & (d >= m - 4*s)
# Exclude values from the Dataset
Data_CF_VA = Data_CF_VA[Score_2_Rule]
# 3 - Remove the Erroneous Height Values (tallest living man is 251 cm)
Height_Rule = (Data_CF_VA.Height_cm < 251)
# Exclude values from the Dataset
Data_CF_VA = Data_CF_VA[Height_Rule]
# 4 - Remove NAN Overall Points
Rule4 = pd.notnull(Data_CF_VA[' overall-points'])
Data_CF_VA = Data_CF_VA[Rule4]
# 5 - Remove nan Weight Values (a >= 0 comparison is False for NaN)
Rule5 = Data_CF_VA['Weight_kg'] >= 0
Data_CF_VA = Data_CF_VA[Rule5]
# Remaining Dataset Size after Outlier and Spurious Data values
# Fix: `print('...') % (...)` only works as a Python-2 print statement; under
# Python 3 it applies % to None and raises.  Format inside the call instead.
print('Remaining Dataset Size\n\nNumber of Rows: %d\nNumber of Features: %d' % (Data_CF_VA.shape[0], Data_CF_VA.shape[1]))
# PHASE 2 - From Methodology
# Investigate relationships between Features
# All split by Gender - Splitting them by Gender
Data_CF_VA.Gender.value_counts()
# Proposed Investigation
# Score - Age
# Score - Overall Rank
# Score - Weight
# Region - Score - Box plots
# Weight - Height - Coloured by Overall Rank
# Get Male and Female Datasets
Data_CF_VA_Male = Data_CF_VA[Data_CF_VA.Gender == 'M']
Data_CF_VA_Female = Data_CF_VA[Data_CF_VA.Gender == 'F']
# Define Colours - Female: map overall points linearly onto a 6-colour ramp
colorField = Data_CF_VA_Female[' overall-points'].as_matrix()
cm = np.array(["#C7E9B4", "#7FCDBB", "#41B6C4", "#1D91C0", "#225EA8", "#0C2C84"])
ix = ((colorField-colorField.min())/(colorField.max()-colorField.min())*5).astype('int')
colorsF = cm[ix]
# Define Colours - Male
# Fix: this previously read the *Female* subset again (copy-paste error), so
# every male scatter plot below was coloured by the wrong athletes' points.
colorField = Data_CF_VA_Male[' overall-points'].as_matrix()
cm = np.array(["#C7E9B4", "#7FCDBB", "#41B6C4", "#1D91C0", "#225EA8", "#0C2C84"])
ix = ((colorField-colorField.min())/(colorField.max()-colorField.min())*5).astype('int')
colorsM = cm[ix]
# Scatter Plots -> Score - Age
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male['age'].astype(int).as_matrix()
# Bokeh interactive toolbar configuration shared by every figure below
TOOLS="resize,crosshair,pan,wheel_zoom,box_zoom,reset,tap,previewsave,box_select,poly_select,lasso_select"
# Create a figure
p1 = figure(tools=TOOLS,
            title="Overall Points v Age - Male",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation - one point per athlete, coloured by overall points
p1.scatter(x,
           y,
           fill_color=colorsM,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p1)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female['age'].astype(int).as_matrix()
# Create a figure
p2 = figure(tools=TOOLS,
            title="Overall Points v Age - Female",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p2.scatter(x,
           y,
           fill_color=colorsF,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p2)
# Scatter Plots -> Score - Overall Rank
# NOTE(review): despite the heading, this plots overall points against the
# week-1 score, not the overall rank.
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score1'].astype(int).as_matrix()
# Create a figure
p3 = figure(tools=TOOLS,
            title="Overall Points v Score1 - Male",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p3.scatter(x,
           y,
           fill_color=colorsM,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p3)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score1'].astype(int).as_matrix()
# Create a figure
p4 = figure(tools=TOOLS,
            title="Overall Points v Score1 - Female",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p4.scatter(x,
           y,
           fill_color=colorsF,
           fill_alpha=1,
           line_color=None)
# Show plot in Ipython Notebook
show(p4)
# Scatter Plots -> Score - Weight
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score2'].astype(int).as_matrix()
# Create a figure
p5 = figure(tools=TOOLS,
            title="Overall Points v Score2 - Male",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p5.scatter(x,
           y,
           fill_color=colorsM,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p5)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score2'].astype(int).as_matrix()
# Create a figure
p6 = figure(tools=TOOLS,
            title="Overall Points v Score2 - Female",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p6.scatter(x,
           y,
           fill_color=colorsF,
           fill_alpha=1,
           line_color=None)
# Show plot in Ipython Notebook
show(p6)
# Scatter Plots -> Score - Weight
# NOTE(review): headings in this section say "Score - Weight" but the plots
# are overall points against the week 3/4/5 scores.
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score3'].astype(int).as_matrix()
# Create a figure
p7 = figure(tools=TOOLS,
            title="Overall Points v Score3 - Male",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p7.scatter(x,
           y,
           fill_color=colorsM,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p7)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score3'].astype(int).as_matrix()
# Create a figure
p8 = figure(tools=TOOLS,
            title="Overall Points v Score3 - Female",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p8.scatter(x,
           y,
           fill_color=colorsF,
           fill_alpha=1,
           line_color=None)
# Show plot in Ipython Notebook
show(p8)
# Scatter Plots -> Score - Weight
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score4'].astype(int).as_matrix()
# Create a figure
p9 = figure(tools=TOOLS,
            title="Overall Points v Score4 - Male",
            toolbar_location="left",
            plot_width = 800,
            plot_height= 800)
# Scatter plot creation
p9.scatter(x,
           y,
           fill_color=colorsM,
           fill_alpha=0.6,
           line_color=None)
# Show plot in Ipython Notebook
show(p9)
# Scatter Plots -> Score - Weight
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score5'].astype(int).as_matrix()
# Create a figure
p12 = figure(tools=TOOLS,
             title="Overall Points v Score5 - Male",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p12.scatter(x,
            y,
            fill_color=colorsM,
            fill_alpha=0.6,
            line_color=None)
# Show plot in Ipython Notebook
show(p12)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score4'].astype(int).as_matrix()
# Create a figure
p11 = figure(tools=TOOLS,
             title="Overall Points v Score4 - Female",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p11.scatter(x,
            y,
            fill_color=colorsF,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p11)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score5'].astype(int).as_matrix()
# Create a figure
p13 = figure(tools=TOOLS,
             title="Overall Points v Score5 - Female",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p13.scatter(x,
            y,
            fill_color=colorsF,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p13)
# Scatter Plots -> Score - Weight
# NOTE(review): heading mislabelled - these are overall points v week-6 score.
# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score6'].astype(int).as_matrix()
# Create a figure
p14 = figure(tools=TOOLS,
             title="Overall Points v Score6 - Male",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p14.scatter(x,
            y,
            fill_color=colorsM,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p14)
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score6'].astype(int).as_matrix()
# Create a figure
p15 = figure(tools=TOOLS,
             title="Overall Points v Score6 - Female",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p15.scatter(x,
            y,
            fill_color=colorsF,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p15)
# Scatter Plots -> Weight - Height
# Define Ranges
y = Data_CF_VA_Male['Height_cm'].astype(int).as_matrix()
x = Data_CF_VA_Male['Weight_kg'].astype(int).as_matrix()
# Create a figure
p16 = figure(tools=TOOLS,
             title="Weight v height - Male",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p16.scatter(x,
            y,
            fill_color=colorsM,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p16)
# Define Ranges
y = Data_CF_VA_Female['Height_cm'].astype(int).as_matrix()
x = Data_CF_VA_Female['Weight_kg'].astype(int).as_matrix()
# Create a figure
p17 = figure(tools=TOOLS,
             title="Weight v Height - Female",
             toolbar_location="left",
             plot_width = 800,
             plot_height= 800)
# Scatter plot creation
p17.scatter(x,
            y,
            fill_color=colorsF,
            fill_alpha=1,
            line_color=None)
# Show plot in Ipython Notebook
show(p17)
# PHASE 3 - Describing the relationships
# Get the scores into a seperate Dataframe (per gender, with rank/body stats)
Data_CF_VA_Scores_Male = Data_CF_VA_Male[['overall-rank',
                                          ' score1',
                                          ' score2',
                                          ' score3',
                                          ' score4',
                                          ' score5',
                                          ' score6',
                                          'Weight_kg',
                                          'age',
                                          'Height_cm']]
Data_CF_VA_Scores_Female = Data_CF_VA_Female[['overall-rank',
                                              ' score1',
                                              ' score2',
                                              ' score3',
                                              ' score4',
                                              ' score5',
                                              ' score6',
                                              'Weight_kg',
                                              'age',
                                              'Height_cm']]
# Keep untouched copies before any further mutation downstream
Actual_Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.copy()
Actual_Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.copy()
# Correlation Plot - Male
# NOTE(review): seaborn.corrplot was removed in later seaborn releases
# (replaced by heatmap of DataFrame.corr()) - this pins an old seaborn.
seaborn.corrplot(Data_CF_VA_Scores_Male.dropna())
# Correlation Plot - Female
seaborn.corrplot(Data_CF_VA_Scores_Female.dropna())
# PHASE 4 - Exploring the findings iteratively through interaction of Computational Techniques
# Autoencoder - Dimensionality Reduction
# Prepare the Data - Normalise
# Train the Neural Network
# Reconstruct to Test Performance of the Autoencoder
# Encoder to get two features
# Export Data to R and Use h2o with Tableau
# An overview of Autoencoder
# Reasoning for use:
# Advantage of using PCA is restricted to the linearity assumption, whereas an auto encoders can have nonlinear enoder/decoders.
print('An Example of an Autoencoder is shown below\n')
print('In order to use this Autoencoder the following steps are:\n')
print('1 - Training the Networks - where the Input layer is the same as the output layer\n')
print('2 - Access the Reconstruction Error - Ensure the error is low before moving on\n')
print('3 - Encoding - Encoder the data on both the Male and Females datasets')
from IPython.display import Image
Image(filename='autoencoder.png')
# Denoising Autoencoder - To be used for Dimensionality Reduction
# Denoising Autoencoders (dA)
#
# References :
# - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
# Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
# 2008
#
# - DeepLearningTutorials
# https://github.com/lisa-lab/DeepLearningTutorials
#
# - Yusuke Sugomori: Stochastic Gradient Descent for Denoising Autoencoders,
# http://yusugomori.com/docs/SGD_DA.pdf
import sys
import numpy
# Silence numpy floating-point warnings (log(0) etc. in the cost function)
numpy.seterr(all='ignore')
def sigmoid(x):
    """Element-wise logistic function, mapping the reals onto (0, 1)."""
    return 1.0 / (1.0 + numpy.exp(-x))
class dA(object):
    """Denoising autoencoder trained by stochastic gradient descent.

    Maps `n_visible` inputs to `n_hidden` sigmoid units (encode) and back
    (decode, using the tied transpose weights `W_prime = W.T`).  Training
    corrupts the input by randomly zeroing entries and minimises the
    cross-entropy between the clean input and its reconstruction.
    Adapted from Yusuke Sugomori's SGD denoising-autoencoder reference code.
    """

    def __init__(self, input=None, n_visible=2, n_hidden=3, \
                 W=None, hbias=None, vbias=None, numpy_rng=None):
        # input: training matrix (rows = samples); may also be supplied later
        # to train().  W/hbias/vbias default to a uniform init and zeros.
        self.n_visible = n_visible # num of units in visible (input) layer
        self.n_hidden = n_hidden # num of units in hidden layer
        if numpy_rng is None:
            # Fixed seed => reproducible weight initialisation
            numpy_rng = numpy.random.RandomState(1234)
        if W is None:
            a = 1. / n_visible
            initial_W = numpy.array(numpy_rng.uniform( # initialize W uniformly
                low=-a,
                high=a,
                size=(n_visible, n_hidden)))
            W = initial_W
        if hbias is None:
            hbias = numpy.zeros(n_hidden) # initialize h bias 0
        if vbias is None:
            vbias = numpy.zeros(n_visible) # initialize v bias 0
        self.numpy_rng = numpy_rng
        self.x = input
        self.W = W
        # Tied weights: the decoder reuses the encoder weights transposed.
        self.W_prime = self.W.T
        self.hbias = hbias
        self.vbias = vbias
        # self.params = [self.W, self.hbias, self.vbias]

    def get_corrupted_input(self, input, corruption_level):
        """Zero each entry of *input* independently with probability
        *corruption_level* (multiplicative binomial mask)."""
        assert corruption_level < 1
        return self.numpy_rng.binomial(size=input.shape,
                                       n=1,
                                       p=1-corruption_level) * input

    # Encode
    def get_hidden_values(self, input):
        """Hidden activations: sigmoid(input . W + hbias)."""
        return sigmoid(numpy.dot(input, self.W) + self.hbias)

    # Decode
    def get_reconstructed_input(self, hidden):
        """Reconstruction: sigmoid(hidden . W' + vbias)."""
        return sigmoid(numpy.dot(hidden, self.W_prime) + self.vbias)

    def train(self, lr=0.1, corruption_level=0.3, input=None):
        """Perform one SGD step over the whole stored batch.

        Corrupts the input, encodes/decodes it, then updates W, hbias and
        vbias in place using the cross-entropy gradient for sigmoid units
        (hence the `y * (1 - y)` factor in the hidden delta).
        """
        if input is not None:
            self.x = input
        x = self.x
        tilde_x = self.get_corrupted_input(x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # Visible-layer error (clean input minus reconstruction)
        L_h2 = x - z
        # Back-propagated hidden-layer delta
        L_h1 = numpy.dot(L_h2, self.W) * y * (1 - y)
        L_vbias = L_h2
        L_hbias = L_h1
        # Tied-weight gradient: encoder and decoder contributions summed
        L_W = numpy.dot(tilde_x.T, L_h1) + numpy.dot(L_h2.T, y)
        self.W += lr * L_W
        self.hbias += lr * numpy.mean(L_hbias, axis=0)
        self.vbias += lr * numpy.mean(L_vbias, axis=0)

    def negative_log_likelihood(self, corruption_level=0.3):
        """Mean cross-entropy between the stored input and the
        reconstruction of a freshly corrupted copy (stochastic: a new
        corruption mask is drawn on every call)."""
        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        cross_entropy = - numpy.mean(
            numpy.sum(self.x * numpy.log(z) +
                      (1 - self.x) * numpy.log(1 - z),
                      axis=1))
        return cross_entropy

    def reconstruct(self, x):
        """Encode then decode *x* without corruption."""
        y = self.get_hidden_values(x)
        z = self.get_reconstructed_input(y)
        return z
def test_dA(learning_rate=0.1, corruption_level=0.1, training_epochs=500):
    """Smoke-test the dA class on a toy two-cluster binary dataset.

    Trains a 20-visible / 2-hidden denoising autoencoder and prints the
    reconstruction of two noisy probe vectors.
    """
    data = numpy.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
    rng = numpy.random.RandomState(123)
    # construct dA
    da = dA(input=data, n_visible=20, n_hidden=2, numpy_rng=rng)
    # train
    # Fix: `range` instead of the Python-2-only `xrange` (identical here).
    for epoch in range(training_epochs):
        da.train(lr=learning_rate, corruption_level=corruption_level)
    # test
    x = numpy.array([[1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0]])
    # Fix: print() call is portable - the original Python-2 print statement
    # is a syntax error under Python 3.
    print(da.reconstruct(x))
# Main
if __name__ == "__main__":
    #test_dA()
    print('Autoencoder Class built - Ready for testing')
# Prepare the Data
data_M_numerical = Data_CF_VA_Scores_Male.as_matrix()
data_F_numerical = Data_CF_VA_Scores_Female.as_matrix()
# Train the Neural Network - Male
# Define Parameters
learning_rate = 0.001
corruption_level = 0.3
training_epochs = 5000
# NOTE(review): learning_rate/training_epochs are defined but dA.train() is
# never called below - the networks are only *constructed*, so the hidden
# values inspected later come from the random initial weights.  Confirm
# whether a training loop was intended here.
# Training
Male_Autoencoder = dA(input = data_M_numerical,
                      n_visible = data_M_numerical.shape[1],
                      n_hidden = 2)
# Train the Neural Network - Female
# Define Parameters
learning_rate = 0.001
corruption_level = 0.3
training_epochs = 5000
# Training
Female_Autoencoder = dA(input = data_F_numerical,
                        n_visible = data_F_numerical.shape[1],
                        n_hidden = 2)
Mcost = Male_Autoencoder.negative_log_likelihood(corruption_level=corruption_level)
# Fix: `print('...') % (...)` applies % to print's None return value under
# Python 3 and raises; format inside the call so it runs on 2 and 3 alike.
print('Male Autoencoder Cost Value: %.4f' % (Mcost))
Fcost = Female_Autoencoder.negative_log_likelihood(corruption_level)
print('Female Autoencoder Cost Value: %.4f' % (Fcost))
# Encoding the data - project each athlete onto the two hidden units
Male_Autoencoder_Hidden_Values = pd.DataFrame(Male_Autoencoder.get_hidden_values(data_M_numerical),
                                              columns = ['Autoencoder_1','Autoencoder_2'])
Female_Autoencoder_Hidden_Values = pd.DataFrame(Female_Autoencoder.get_hidden_values(data_F_numerical),
                                                columns = ['Autoencoder_1','Autoencoder_2'])
# Plot components - To inspect what the Neural Network Calculated
Female_Autoencoder_Hidden_Values.plot(kind='hexbin', x='Autoencoder_1', y='Autoencoder_2', gridsize=5)
Male_Autoencoder_Hidden_Values.plot(kind='hexbin', x='Autoencoder_1', y='Autoencoder_2', gridsize=5)
# Nothing interesting can be seen from the Autoencoder - will retry later
# No normalisation was applied which may explain for the poor results
# Get the missing Data (identity columns dropped during the earlier subset)
Merged_Data = Data_CF[['athlete_ID',
                       'nameURL',
                       'First_Name',
                       'Last_Name',
                       'Region',
                       'sex&division',
                       'Gender']]
# Merge Data togeather - Male (index-aligned left join back onto Data_CF rows)
Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.merge(Merged_Data,
                                                      how = 'left',
                                                      left_index = True,
                                                      right_index = True)
# Merge Data togeather - Female
Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.merge(Merged_Data,
                                                          how = 'left',
                                                          left_index = True,
                                                          right_index = True)
# Recalculate Points Total of Remaining Atheletes (NaN scores count as 0
# under DataFrame.sum's default NaN handling)
Data_CF_VA_Scores_Male['Overall_Points'] = Data_CF_VA_Scores_Male[[' score1', ' score2',' score3',' score4',' score5',' score6']].sum(axis=1)
Data_CF_VA_Scores_Female['Overall_Points'] = Data_CF_VA_Scores_Female[[' score1', ' score2',' score3',' score4',' score5',' score6']].sum(axis=1)
# Create Percentiles - equal-frequency bins 0..99 over recalculated points
Data_CF_VA_Scores_Male['Percentiles100'] = pd.qcut(Data_CF_VA_Scores_Male.Overall_Points.as_matrix(),
                                                   q = 100,
                                                   labels=False)
Data_CF_VA_Scores_Female['Percentiles100'] = pd.qcut(Data_CF_VA_Scores_Female.Overall_Points.as_matrix(),
                                                     q = 100,
                                                     labels=False)
# Refresh the untouched copies now that merge/points/percentiles are applied
Actual_Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.copy()
Actual_Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.copy()
Data_CF_VA_Scores_Male.describe().T
Data_CF_VA_Scores_Female.describe().T
# Exploring the Dataset further by creating to similiarity matrices per dataset
# 1 - Similarity Matrix of [Age, Weight, Height]
# 2 - Similarity Matrix of [Score1, Score2, Score3, Score4, Score5, Score6]
# Number of Components Required
n_components = 2
# Convert Dataframes to Numpy Arrays
# Get Lists for Subsetting
Athele_Characteristics = ['Weight_kg', 'age', 'Height_cm']
Scores_List = [' score1', ' score2', ' score3', ' score4', ' score5', ' score6']
# Get Arrays - All four (AC = athlete characteristics, SL = scores list)
AC_Male = Data_CF_VA_Scores_Male[Athele_Characteristics]
AC_Female = Data_CF_VA_Scores_Female[Athele_Characteristics]
SL_Male = Data_CF_VA_Scores_Male[Scores_List]
SL_Female = Data_CF_VA_Scores_Female[Scores_List]
# Using Multidimensional Scaling - Dimensionality Reduction
# Import the Module
from sklearn import manifold
# Define a Function for ease of Processing
def mds_function(X, n_components, name, max_iter=100, random_state=None):
    """Reduce X to ``n_components`` dimensions with metric MDS and time it.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to embed.
    n_components : int
        Dimensionality of the embedding.
    name : str
        Label used in the timing printout.
    max_iter : int, optional
        Maximum SMACOF iterations (default 100, matching the original).
    random_state : int or None, optional
        Seed for the stochastic initialisation.  Pass an int for
        reproducible embeddings; the default ``None`` keeps the original
        (non-deterministic) behaviour.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_components)
        The embedded coordinates.
    """
    t0 = time.time()
    # n_init=1 keeps a single SMACOF run: fast, but sensitive to the
    # random initialisation -- hence the random_state hook above.
    mds = manifold.MDS(n_components, max_iter=max_iter, n_init=1,
                       random_state=random_state)
    Y = mds.fit_transform(X)
    t1 = time.time()
    print("MDS Transformation of %s: %.2f sec" % (str(name), (t1 - t0)))
    return Y
# Apply MDS to each of the four frames; wrap each result in a DataFrame
# with explicit component column names so it can be merged back later.
# NOTE(review): .as_matrix() was removed in pandas 1.0; .values is the
# modern equivalent.
AC_Male_Trans = pd.DataFrame(mds_function(AC_Male.as_matrix(),
n_components,
'AC Male'), columns = ['Component_1_AC_Male_Trans','Component_2_AC_Male_Trans'])
AC_Female_Trans = pd.DataFrame(mds_function(AC_Female.as_matrix(),
n_components,
'AC Female'), columns = ['Component_1_AC_Female_Trans','Component_2_AC_Female_Trans'])
SL_Male_Trans = pd.DataFrame(mds_function(SL_Male.as_matrix(),
n_components,
'SL_Male'), columns = ['Component_1_SL_Male_Trans','Component_2_SL_Male_Trans'])
SL_Female_Trans = pd.DataFrame(mds_function(SL_Female.as_matrix(),
n_components,
'SL_Female'), columns = ['Component_1_SL_Female_Trans','Component_2_SL_Female_Trans'])
# Dimensionality Reduction Algorithm 2 - Autoencoder Attempt 2
# Normalise columns first - using the function below to loop through them
def normalize_columns(df):
    """Min-max scale every column of ``df`` to the range [0, 1] **in place**.

    The input frame itself is mutated (the calling code relies on this
    side effect) and is also returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same (now scaled) frame.
    """
    for feature_name in df.columns:
        # Min and max define the scaling range for this column.
        max_val = df[feature_name].max()
        min_val = df[feature_name].min()
        span = max_val - min_val
        if span == 0:
            # FIX: a constant column previously divided by zero and
            # filled the column with NaN/inf; map it to 0.0 instead.
            df[feature_name] = 0.0
        else:
            df[feature_name] = (df[feature_name] - min_val) / span
    return df
# Normalise the four frames.  NOTE(review): normalize_columns mutates its
# argument in place, so AC_Male / AC_Female / SL_Male / SL_Female are now
# themselves scaled -- the later merges export the scaled values; confirm
# that is intentional.
AC_Male_Trans_2 = normalize_columns(AC_Male)
AC_Female_Trans_2 = normalize_columns(AC_Female)
SL_Male_Trans_2 = normalize_columns(SL_Male)
SL_Female_Trans_2 = normalize_columns(SL_Female)
# Train one autoencoder per frame, compressing to 2 hidden units.
# NOTE(review): dA is not defined in the visible code -- presumably a
# denoising-autoencoder class defined/imported elsewhere in the notebook;
# verify its training happens inside the constructor as used here.
# Training
Female_Autoencoder_1 = dA(input = AC_Female_Trans_2.as_matrix(),
n_visible = AC_Female_Trans_2.shape[1],
n_hidden = 2)
# Training
Male_Autoencoder_1 = dA(input = AC_Male_Trans_2.as_matrix(),
n_visible = AC_Male_Trans_2.shape[1],
n_hidden = 2)
# Training
Female_Autoencoder_2 = dA(input = SL_Female_Trans_2.as_matrix(),
n_visible = SL_Female_Trans_2.shape[1],
n_hidden = 2)
# Training
Male_Autoencoder_2 = dA(input = SL_Male_Trans_2.as_matrix(),
n_visible = SL_Male_Trans_2.shape[1],
n_hidden = 2)
# Encode the data: replace each *_Trans_2 frame with its 2-D hidden
# representation, named so the components survive the later merges.
AC_Male_Trans_2 = pd.DataFrame(Male_Autoencoder_1.get_hidden_values(AC_Male_Trans_2.as_matrix()),
columns = ['Autoencoder_1_AC_Male_Trans_2','Autoencoder_2_AC_Male_Trans_2'])
AC_Female_Trans_2 = pd.DataFrame(Female_Autoencoder_1.get_hidden_values(AC_Female_Trans_2.as_matrix()),
columns = ['Autoencoder_1_AC_Female_Trans_2','Autoencoder_2_AC_Female_Trans_2'])
SL_Male_Trans_2 = pd.DataFrame(Male_Autoencoder_2.get_hidden_values(SL_Male_Trans_2.as_matrix()),
columns = ['Autoencoder_1_SL_Male_Trans_2','Autoencoder_2_SL_Male_Trans_2'])
SL_Female_Trans_2 = pd.DataFrame(Female_Autoencoder_2.get_hidden_values(SL_Female_Trans_2.as_matrix()),
columns = ['Autoencoder_1_SL_Female_Trans_2','Autoencoder_2_SL_Female_Trans_2'])
# Sanity-check scatter plots of each encoding (plt comes from %pylab).
# NOTE(review): the doubled quotes in these titles are adjacent string
# literals -- they concatenate, so the word renders without quote marks.
AC_Male_Trans_2.plot(kind='scatter', x='Autoencoder_1_AC_Male_Trans_2', y='Autoencoder_2_AC_Male_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...AC_Male_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
AC_Female_Trans_2.plot(kind='scatter', x='Autoencoder_1_AC_Female_Trans_2', y='Autoencoder_2_AC_Female_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...AC_Female_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
SL_Male_Trans_2.plot(kind='scatter', x='Autoencoder_1_SL_Male_Trans_2', y='Autoencoder_2_SL_Male_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...SL_Male_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
SL_Female_Trans_2.plot(kind='scatter', x='Autoencoder_1_SL_Female_Trans_2', y='Autoencoder_2_SL_Female_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...SL_Female_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# From inspection each plot appears usable as a dimensionality reduction.
# Further visual encoding is applied in Tableau to look for structure.
# Clustering Algorithm - Agglomerative clustering
from sklearn.cluster import AgglomerativeClustering
def allogmerative_clustering(X, n):
    """Cluster the rows of ``X`` into ``n`` groups (complete linkage).

    Prints the fit time, scatter-plots the first two columns of ``X``
    coloured by cluster, shows the figure, and returns the label array.
    """
    model = AgglomerativeClustering(linkage='complete', n_clusters=n)
    started = time.time()
    model.fit(X)
    print("%s took: %.2f seconds" % ('Complete linkage', time.time() - started))
    labels = model.labels_
    # Visualise the clusters; the +100 offset only shifts the colour map.
    plt.scatter(X[:, 0], X[:, 1], c=labels + 100)
    plt.show()
    return labels
# Cluster each reduced dataset into 4 groups and store the labels as a
# new column.  First the four MDS embeddings ...
# NOTE(review): .as_matrix() was removed in pandas 1.0; use .values on
# modern pandas.
AC_Male_Trans['Labels_AC_Male_Trans_MDS'] = allogmerative_clustering(AC_Male_Trans.as_matrix(), 4)
AC_Female_Trans['Labels_AC_Female_Trans_MDS'] = allogmerative_clustering(AC_Female_Trans.as_matrix(), 4)
SL_Male_Trans['Labels_SL_Male_Trans_MDS'] = allogmerative_clustering(SL_Male_Trans.as_matrix(), 4)
SL_Female_Trans['Labels_SL_Female_Trans_MDS'] = allogmerative_clustering(SL_Female_Trans.as_matrix(), 4)
# ... then the four autoencoder encodings.
AC_Male_Trans_2['Labels_AC_Male_Trans_2_Auto'] = allogmerative_clustering(AC_Male_Trans_2.as_matrix(), 4)
AC_Female_Trans_2['Labels_AC_Female_Trans_2_Auto'] = allogmerative_clustering(AC_Female_Trans_2.as_matrix(), 4)
SL_Male_Trans_2['Labels_SL_Male_Trans_2_Auto'] = allogmerative_clustering(SL_Male_Trans_2.as_matrix(), 4)
SL_Female_Trans_2['Labels_SL_Female_Trans_2_Auto'] = allogmerative_clustering(SL_Female_Trans_2.as_matrix(), 4)
# Collating data together - give every derived frame the index of its
# source frame so the later index-based merges line up row-for-row.
SL_Male_Trans_2.index = AC_Male_Trans_2.index = SL_Male_Trans.index = AC_Male_Trans.index = SL_Male.index = AC_Male.index
SL_Female_Trans_2.index = AC_Female_Trans_2.index = SL_Female_Trans.index = AC_Female_Trans.index = SL_Female.index = AC_Female.index
# Columns NOT already held in the characteristic/score frames.
# FIX: "Index - list" relied on Index.__sub__ performing set difference,
# which was deprecated and later removed in pandas; Index.difference()
# is the supported equivalent and likewise returns a sorted Index.
Final_Cols = Data_CF_VA_Scores_Male.columns.difference(Athele_Characteristics + Scores_List)
Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female[Final_Cols]
Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male[Final_Cols]
# Assemble the final male dataset: start from the (normalised)
# characteristic frame, then merge scores, remaining columns, the
# untouched originals, and all four reduced/labelled frames -- all on
# the shared row index.
# NOTE(review): Actual_Data_CF_VA_Scores_Male repeats columns already
# present, so pandas will suffix the duplicates with _x/_y in the output
# CSV; confirm that is the intended layout.
Data_CF_VA_Male_Final = AC_Male.merge(SL_Male ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(Data_CF_VA_Scores_Male ,how = 'left', left_index = True, right_index= True)
# Merge the untouched original male scores back in
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(Actual_Data_CF_VA_Scores_Male ,how = 'left', left_index = True, right_index= True)
# Merge cluster labels and dimensionality-reduced components
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(AC_Male_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(SL_Male_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(AC_Male_Trans_2 ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(SL_Male_Trans_2 ,how = 'left', left_index = True, right_index= True)
# Assemble the final female dataset in the same order.
Data_CF_VA_Female_Final = AC_Female.merge(SL_Female ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(Data_CF_VA_Scores_Female ,how = 'left', left_index = True, right_index= True)
# Merge the untouched original female scores back in
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(Actual_Data_CF_VA_Scores_Female ,how = 'left', left_index = True, right_index= True)
# Merge cluster labels and dimensionality-reduced components
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(AC_Female_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(SL_Female_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(AC_Female_Trans_2 ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(SL_Female_Trans_2 ,how = 'left', left_index = True, right_index= True)
# Persist both datasets for the Tableau phase (index dropped on purpose).
Data_CF_VA_Male_Final.to_csv('Subsetted_Male_Dataset.csv',
sep = ',',
index = False)
Data_CF_VA_Female_Final.to_csv('Subsetted_Female_Dataset.csv',
sep = ',',
index = False)
# Report total wall-clock time for the script.
# FIX: the original applied "%" to the *return value* of print() --
# accepted by the Python 2 print statement, but a TypeError on Python 3
# (None % tuple).  Format the string first, then print it.
print('Time to Process Script: %.5f Minutes' % ((time.time() - start) / 60))
# PHASE 5 - Final Enhancements and Visualisations
# The final phase will involve Tableau for comparisons and presentation of results
# NOTE(review): Image is not imported in the visible code -- presumably
# IPython.display.Image supplied by the notebook environment; verify.
Image(filename='tableau.png')